Insert a concise (max 200 word) exectutive summary. It should be a clear, interesting summary of main insights from the report.
library(tidyverse)
library(skimr)
library(naniar)
library(kableExtra)
library(lubridate)
library(plotly)
library(janitor)
library(scales)
raw = read_csv("MVC.csv") %>% clean_names()
# raw %>%
# select(vehicle_type_code_1, vehicle_type_code_2, vehicle_type_code_3, vehicle_type_code_4, vehicle_type_code_5) %>%
# vis_miss(warn_large_data = FALSE)
raw_1 = select(raw, -c(vehicle_type_code_3, vehicle_type_code_4, vehicle_type_code_5))
# raw_1 %>%
# select(contributing_factor_vehicle_1, contributing_factor_vehicle_2, contributing_factor_vehicle_3, contributing_factor_vehicle_4, contributing_factor_vehicle_5) %>%
# vis_miss(warn_large_data = FALSE)
raw_2 = select(raw_1, -c(contributing_factor_vehicle_3, contributing_factor_vehicle_4, contributing_factor_vehicle_5))
data_1 = raw_2[1:10]
data_2 = raw_2[11:23]
# data_1 %>% vis_miss(warn_large_data = FALSE)
data_1 = data_1 %>% select(-c(off_street_name))
# data_2 %>% vis_miss(warn_large_data = FALSE)
data = cbind(data_1, data_2) %>% drop_na()
data = data %>% mutate(`hour` = hour(crash_time))
# raw %>% filter(!is.na(contributing_factor_vehicle_4)) %>% filter(!is.na(contributing_factor_vehicle_5)) %>%
# filter(!is.na(contributing_factor_vehicle_1)) %>%
# filter(!is.na(contributing_factor_vehicle_2)) %>%
# filter(!is.na(contributing_factor_vehicle_3))
raw = raw %>% mutate(`hour` = hour(crash_time))
z1 = raw %>% group_by(hour) %>% summarise(count = n()) %>% mutate(percentage = percent((count/sum(count)), accuracy = 0.01)) %>% as.data.frame()
p = ggplot()+
geom_bar(data=z1, aes(x=hour,y=count),
stat="identity",position = "dodge", fill = "#093146")+
geom_point(data=z1,aes(x=hour,y=count), colour = "red") +
geom_line(data=z1,aes(x=hour,y=count, group = 1), colour = "red")+
theme_classic()+
labs(x = "Hour", y = "Count")
ggplotly(p)
group_raw = raw %>% drop_na(borough) %>% group_by(hour, borough) %>%
summarise(count = n())
tt = group_raw %>%
ggplot(aes(x= hour, y = count, color= borough)) +
geom_line() +
facet_wrap(.~borough, scales = "free_y")
ggplotly(tt)
data = data %>%
mutate(crash_date = mdy(`crash_date`))
data = data %>%
mutate(month = month(crash_date))
g = data %>% group_by(month)%>% summarise(count = n()) %>%
ggplot(aes(x = month, y = count)) +
geom_line()+
theme_classic()
ggplotly(g)
raw = raw %>%
mutate(crash_date = mdy(`crash_date`))
raw = raw %>%
mutate(weekday = weekdays(crash_date))
raw$weekday = raw$weekday %>%
ordered(levels=c("Monday", "Tuesday", "Wednesday", "Thursday",
"Friday", "Saturday", "Sunday"))
z2 = raw %>% group_by(weekday) %>% summarise(count = n()) %>%
mutate(percentage = percent((count/sum(count)), accuracy = 0.01)) %>%
as.data.frame()
pppp = ggplot()+
geom_bar(data=z2, aes(x=weekday,y=count),
stat="identity",position = "dodge", fill = "#093146")+
geom_point(data=z2,aes(x=weekday,y=count), colour = "red") +
geom_line(data=z2,aes(x=weekday,y=count, group = 1), colour = "red")+
theme_classic()+
labs(x = "Hour", y = "Count")
ggplotly(pppp)
group_raw = raw %>% drop_na(borough) %>% group_by(weekday, borough) %>%
summarise(count = n())
tt = ggplot()+
geom_bar(data=group_raw, aes(x= weekday, y = count, color= borough), stat="identity",position = "dodge", fill = "#093146") +
geom_histogram() +
geom_point(data=group_raw,aes(x=weekday,y=count), colour = "red") +
geom_line(data=group_raw,aes(x=weekday,y=count, group = 1), colour = "red")+
facet_wrap(.~borough, scales = "free_y")
ggplotly(tt)
raw = raw %>%
mutate(month = month(crash_date))
raw %>% select(hour, month) %>% table() %>% as.data.frame() %>%
ggplot() +
aes(x=month, y=hour, fill=Freq) %>%
geom_tile()+scale_fill_gradient(low="white", high="darkblue")
raw %>% select(hour, weekday) %>% table() %>% as.data.frame() %>%
ggplot() +
aes(x=weekday, y=hour, fill=Freq) %>%
geom_tile()+scale_fill_gradient(low="white", high="darkblue")
raw %>% glimpse()
## Rows: 1,615,800
## Columns: 32
## $ crash_date <date> 2019-06-20, 2019-07-09, 2019-06-28, 201~
## $ crash_time <time> 02:37:00, 18:00:00, 21:00:00, 12:00:00,~
## $ borough <chr> "BROOKLYN", NA, "QUEENS", NA, "BROOKLYN"~
## $ zip_code <dbl> 11223, NA, 11418, NA, 11236, 10019, 1003~
## $ latitude <dbl> 40.59792, NA, 40.70757, 40.85667, 40.634~
## $ longitude <dbl> -73.96117, NA, -73.83620, -73.86569, -73~
## $ location <chr> "POINT (-73.961174 40.597916)", NA, "POI~
## $ on_street_name <chr> NA, "NORTHERN BOULEVARD", "METROPOLITAN ~
## $ cross_street_name <chr> NA, "CLEARVIEW EXPRESSWAY", "116 STREET"~
## $ off_street_name <chr> "2410 CONEY ISLAND AVENUE", NA, NA,~
## $ number_of_persons_injured <dbl> 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 1, 0~
## $ number_of_persons_killed <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0~
## $ number_of_pedestrians_injured <dbl> 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0~
## $ number_of_pedestrians_killed <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0~
## $ number_of_cyclist_injured <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0~
## $ number_of_cyclist_killed <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0~
## $ number_of_motorist_injured <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0~
## $ number_of_motorist_killed <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0~
## $ contributing_factor_vehicle_1 <chr> "Unspecified", "Following Too Closely", ~
## $ contributing_factor_vehicle_2 <chr> "Unspecified", "Passing or Lane Usage Im~
## $ contributing_factor_vehicle_3 <chr> NA, NA, "Unspecified", NA, NA, NA, NA, N~
## $ contributing_factor_vehicle_4 <chr> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, ~
## $ contributing_factor_vehicle_5 <chr> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, ~
## $ collision_id <dbl> 4155032, 4167185, 4161958, 4170995, 4155~
## $ vehicle_type_code_1 <chr> "Station Wagon/Sport Utility Vehicle", "~
## $ vehicle_type_code_2 <chr> "Station Wagon/Sport Utility Vehicle", "~
## $ vehicle_type_code_3 <chr> NA, NA, "Sedan", NA, NA, NA, NA, NA, NA,~
## $ vehicle_type_code_4 <chr> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, ~
## $ vehicle_type_code_5 <chr> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, ~
## $ hour <int> 2, 18, 21, 12, 16, 18, 15, 23, 15, 23, 1~
## $ weekday <ord> Thursday, Tuesday, Friday, Thursday, Thu~
## $ month <dbl> 6, 7, 6, 7, 6, 7, 7, 6, 6, 7, 7, 6, 7, 6~
dd = data %>% group_by(month, borough) %>%
summarise(
ped = sum(number_of_pedestrians_injured),
cyc = sum(number_of_cyclist_injured),
mot = sum(number_of_motorist_injured)
)
dd = dd %>% pivot_longer(c(-month, -borough))
ll = ggplot(dd,aes(x=month,
y=value,
fill=name, color = borough))+
geom_bar(stat='identity',
position=position_stack(reverse=T))+
facet_wrap(.~borough, scales = "free_y")
ggplotly(ll)
dd = data %>% group_by(month, borough) %>%
summarise(
ped = sum(number_of_pedestrians_injured),
cyc = sum(number_of_cyclist_injured),
mot = sum(number_of_motorist_injured)
)
dd = dd %>% pivot_longer(c(-month, -borough))
ll = ggplot(dd,aes(x=month,
y=value,
fill=name, color = name))+
geom_line()+
facet_wrap(.~borough, scales = "free_y")
ggplotly(ll)
ff = data %>% group_by(month, borough) %>%
summarise(
ped = sum(number_of_pedestrians_killed),
cyc = sum(number_of_cyclist_killed),
mot = sum(number_of_motorist_killed)
)
ff = ff %>% pivot_longer(c(-month, -borough))
ll = ggplot(ff,aes(x=month,
y=value,
fill=name, color = borough))+
geom_bar(stat='identity',
position=position_stack(reverse=T))+
facet_wrap(.~borough, scales = "free_y")
ggplotly(ll)
ff = data %>% group_by(month, borough) %>%
summarise(
ped = sum(number_of_pedestrians_killed),
cyc = sum(number_of_cyclist_killed),
mot = sum(number_of_motorist_killed)
)
ff = ff %>% pivot_longer(c(-month, -borough))
ll = ggplot(ff,aes(x=month,
y=value,
fill=name, color = name))+
geom_line()+
facet_wrap(.~borough, scales = "free_y")
ggplotly(ll)
library(plotly)
hg = raw %>% group_by(borough, contributing_factor_vehicle_1) %>% drop_na(borough) %>% summarise(count = n()) %>%ungroup() %>%
as.data.frame() %>%
arrange(desc(count))
hg = hg %>% arrange(desc(count)) %>% group_by(borough) %>%
summarise(factor = head(contributing_factor_vehicle_1),
freq = head(count))
tt = ggplot(data=hg, aes(x= reorder(factor, freq), y = freq, color= factor))+
geom_bar(stat="identity", fill = "#093146") +
facet_wrap(.~borough, scales = "free", ncol = 1)+
theme(axis.title.x = element_blank(),
axis.title.y = element_blank())+
coord_flip()
ggplotly(tt, height = 800, width=1000)
# plot_list = hg$borough %>% as.data.frame() %>% distinct()
# plot = list()
# for (i in 1:5){
# zs =
# hg %>% filter(borough == plot_list[i, 1]) %>%
# ggplot(aes(x= reorder(factor, freq), y = freq, color= factor))+
# geom_bar(stat="identity", fill = "#093146")+
# coord_flip()
#
# plot[[i]] = zs
# }
# for (i in 1:5){
# print(plot[i])
# }
hg = head(
raw %>% group_by(contributing_factor_vehicle_1) %>% drop_na(contributing_factor_vehicle_1) %>% summarise(count = n()) %>%ungroup() %>%
as.data.frame() %>%
arrange(desc(count))
)
tt = ggplot(data=hg, aes(x= reorder(contributing_factor_vehicle_1, count), y = count, color= contributing_factor_vehicle_1))+
geom_bar(stat="identity", fill = "#093146") +
theme(axis.title.x = element_blank(),
axis.title.y = element_blank())+
coord_flip()
ggplotly(tt)
hg = raw %>% group_by(borough, vehicle_type_code_1) %>% drop_na(borough) %>% summarise(count = n()) %>%ungroup() %>%
as.data.frame() %>%
arrange(desc(count))
hg = hg %>% arrange(desc(count)) %>% group_by(borough) %>%
summarise(factor = head(vehicle_type_code_1),
freq = head(count))
tt = ggplot(data=hg, aes(x= reorder(factor, freq), y = freq, color= factor))+
geom_bar(stat="identity", fill = "#093146") +
facet_wrap(.~borough, scales = "free", ncol = 1)+
theme(axis.title.x = element_blank(),
axis.title.y = element_blank())+
coord_flip()
ggplotly(tt, height = 800, width=1000)
Here you should
Here you should
Insert your reflection on how data wrangling helped you explore your research questions. (Don’t forget to adjust information at the top of report regarding your name in the author field etc!!)